--- title: OCO2 - Count Cities and Plants inside a capture zone around the peak keywords: fastai sidebar: home_sidebar ---
{% raw %}
{% endraw %} {% raw %}
!pip install --user python-swiftclient python-keystoneclient --upgrade
!apt-get install swiftclient
!pip install geopandas

#Solve the geopandas rtree problem for sjoin
!apt-get install -y libspatialindex-dev
!pip install rtree
{% endraw %} {% raw %}
config_path = "./config.json"
{% endraw %}

Introduction

{% raw %}
import pandas as pd
import geopandas as gpd
import numpy as np
from numpy import exp, loadtxt, pi, sqrt
import math
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import swiftclient
import json
from io import StringIO
import folium
from folium import plugins
import geopy
from shapely.geometry import Polygon
from geopy.distance import VincentyDistance
{% endraw %}

We establish the connection with the Swift server to pull and push resources.

{% raw %}
with open(config_path) as json_data_file:
    config = json.load(json_data_file)
    
def swift_con(config):
    """Open a connection to the Swift object store.

    Reads credentials and endpoint settings from the 'swift_storage'
    section of the parsed JSON config and returns a
    swiftclient.Connection ready for get/put operations.
    """
    storage = config['swift_storage']
    return swiftclient.Connection(user=storage['user'],
                                  key=storage['key'],
                                  authurl=storage['auth_url'],
                                  os_options=storage['options'],
                                  tenant_name=storage['tenant_name'],
                                  auth_version=storage['auth_version'])

conn = swift_con(config)
{% endraw %}

Retrieve Data

Gaussian Peak Detection

The CSV contains the resulting detected peaks of the Gaussian method implemented in FC's paper, reproduced by Benoit. The CSV is stored on the Swift server.

We retrieve here the August 2018 data as an example.

{% raw %}
csv = conn.get_object("oco2", "/datasets/oco-2/peaks-detected/result_for_oco2_1808.csv")[1]
peak_fc = pd.read_csv(StringIO(str(csv, 'utf-8')),index_col=0)
peak_fc.head()
sounding_id latitude longitude orbit slope intercept amplitude sigma delta R windspeed_u windspeed_v surface_pressure_apriori surface_pressure altitude land_water_indicator land_fraction
0 2018080101035604 25.425072 -177.345490 21709 0.000430 404.902899 -14.694226 -3.987167 1.470254 0.677812 -7.759225 -0.739198 1016.253113 1018.198181 0.000000 1.0 0.0
1 2018080101062035 33.441360 -179.630508 21709 -0.002734 404.470875 18.128640 3.086571 2.343144 -0.656713 -4.649583 -0.913302 1022.977173 1023.379456 0.000000 1.0 0.0
2 2018080101062937 33.919857 -179.779083 21709 -0.006755 404.416030 -11.847855 -3.346435 1.412432 0.592854 -4.035365 -0.879741 1023.283203 1026.469604 0.000000 1.0 0.0
3 2018080111005177 37.444622 30.755703 21715 0.006885 404.427158 64.732216 11.112728 2.323860 0.654418 0.284106 -0.315598 963.941162 964.808594 367.765533 0.0 100.0
4 2018080220003337 44.050335 -106.099586 21735 -0.002019 405.980164 48.714267 14.632683 1.328135 0.528694 1.296467 -3.582739 859.926086 863.947021 1374.157227 0.0 100.0
{% endraw %}

We spot some negative sigma and amplitude parameters and convert the sounding_id into a datetime variable date:

{% raw %}
from datetime import datetime
def to_date(a):
    """Convert a sounding_id (YYYYMMDDhhmmssff) into a datetime.

    The last two digits are parsed by %f as a fraction of a second.
    """
    stamp = str(a)
    return datetime.strptime(stamp, '%Y%m%d%H%M%S%f')

peak_fc['date'] = peak_fc['sounding_id'].apply(to_date)
peak_fc['sigma'] = peak_fc['sigma'].apply(abs)
peak_fc['amplitude'] = peak_fc['amplitude'].apply(abs)
peak_fc.head()
sounding_id latitude longitude orbit slope intercept amplitude sigma delta R windspeed_u windspeed_v surface_pressure_apriori surface_pressure altitude land_water_indicator land_fraction date
0 2018080101035604 25.425072 -177.345490 21709 0.000430 404.902899 14.694226 3.987167 1.470254 0.677812 -7.759225 -0.739198 1016.253113 1018.198181 0.000000 1.0 0.0 2018-08-01 01:03:56.040
1 2018080101062035 33.441360 -179.630508 21709 -0.002734 404.470875 18.128640 3.086571 2.343144 -0.656713 -4.649583 -0.913302 1022.977173 1023.379456 0.000000 1.0 0.0 2018-08-01 01:06:20.350
2 2018080101062937 33.919857 -179.779083 21709 -0.006755 404.416030 11.847855 3.346435 1.412432 0.592854 -4.035365 -0.879741 1023.283203 1026.469604 0.000000 1.0 0.0 2018-08-01 01:06:29.370
3 2018080111005177 37.444622 30.755703 21715 0.006885 404.427158 64.732216 11.112728 2.323860 0.654418 0.284106 -0.315598 963.941162 964.808594 367.765533 0.0 100.0 2018-08-01 11:00:51.770
4 2018080220003337 44.050335 -106.099586 21735 -0.002019 405.980164 48.714267 14.632683 1.328135 0.528694 1.296467 -3.582739 859.926086 863.947021 1374.157227 0.0 100.0 2018-08-02 20:00:33.370
{% endraw %}

We transform it into a GeoDataFrame :

{% raw %}
peak_fc = gpd.GeoDataFrame(peak_fc, geometry=gpd.points_from_xy(peak_fc.longitude, peak_fc.latitude)).copy()
peak_fc.crs = {'init': 'epsg:4326'}
peak_fc.head()
/usr/local/lib/python3.6/dist-packages/pyproj/crs/crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))
sounding_id latitude longitude orbit slope intercept amplitude sigma delta R windspeed_u windspeed_v surface_pressure_apriori surface_pressure altitude land_water_indicator land_fraction date geometry
0 2018080101035604 25.425072 -177.345490 21709 0.000430 404.902899 14.694226 3.987167 1.470254 0.677812 -7.759225 -0.739198 1016.253113 1018.198181 0.000000 1.0 0.0 2018-08-01 01:03:56.040 POINT (-177.34549 25.42507)
1 2018080101062035 33.441360 -179.630508 21709 -0.002734 404.470875 18.128640 3.086571 2.343144 -0.656713 -4.649583 -0.913302 1022.977173 1023.379456 0.000000 1.0 0.0 2018-08-01 01:06:20.350 POINT (-179.63051 33.44136)
2 2018080101062937 33.919857 -179.779083 21709 -0.006755 404.416030 11.847855 3.346435 1.412432 0.592854 -4.035365 -0.879741 1023.283203 1026.469604 0.000000 1.0 0.0 2018-08-01 01:06:29.370 POINT (-179.77908 33.91986)
3 2018080111005177 37.444622 30.755703 21715 0.006885 404.427158 64.732216 11.112728 2.323860 0.654418 0.284106 -0.315598 963.941162 964.808594 367.765533 0.0 100.0 2018-08-01 11:00:51.770 POINT (30.75570 37.44462)
4 2018080220003337 44.050335 -106.099586 21735 -0.002019 405.980164 48.714267 14.632683 1.328135 0.528694 1.296467 -3.582739 859.926086 863.947021 1374.157227 0.0 100.0 2018-08-02 20:00:33.370 POINT (-106.09959 44.05033)
{% endraw %}

LOF & DBSCAN Filtered Peaks

The CSV contains the same peaks (same month) filtered by a LOF and a DBSCAN by Charlotte and Raphaele. The CSV is stored on GitHub.

{% raw %}
# Charlotte & Raphaele's method
path_peaks = "https://raw.githubusercontent.com/dataforgoodfr/batch7_satellite_ges/master/dataset/output/peaks_out_1808.csv"
peak_cr = pd.read_csv(path_peaks, sep=",")
peak_cr.head()
sounding_id latitude longitude orbit slope intercept amplitude sigma delta R windspeed_u windspeed_v surface_pressure surface_pressure_apriori land_water_indicator land_fraction surf_pres y_class_lof outlier_score_lof y_class_dbscan y_class_lof_only_gaussian_param y_class_dbscan_only_gaussian_param
0 2018080101035604 25.425072 -177.345490 21709 0.000430 404.902899 -14.694226 -3.987167 1.470254 0.677812 -7.759225 -0.739198 1018.198181 1016.253113 1.0 0.0 -1.945068 -1 -1.501400 -1 1 -1
1 2018080101060803 32.777554 -179.428162 21709 0.002886 404.622407 1.493849 7.887665 0.075556 0.532213 -5.420354 -0.992332 1025.610962 1022.500305 1.0 0.0 -3.110657 1 -1.021724 1 1 1
2 2018080101062035 33.441360 -179.630508 21709 -0.002734 404.470875 18.128640 3.086571 2.343144 -0.656713 -4.649583 -0.913302 1023.379456 1022.977173 1.0 0.0 -0.402283 1 -1.171819 -1 1 -1
3 2018080101062937 33.919857 -179.779083 21709 -0.006755 404.416030 -11.847855 -3.346435 1.412432 0.592854 -4.035365 -0.879741 1026.469604 1023.283203 1.0 0.0 -3.186401 1 -1.461872 -1 1 -1
4 2018080102530302 58.858116 143.781662 21710 0.003719 399.445056 -0.018193 -5.552475 0.001307 0.730703 -3.837832 0.812698 995.894531 995.617310 1.0 0.0 -0.277222 1 -1.290330 -1 -1 -1
{% endraw %} {% raw %}
peak_cr = gpd.GeoDataFrame(peak_cr, geometry=gpd.points_from_xy(peak_cr.longitude, peak_cr.latitude)).copy()
peak_cr.crs = {'init': 'epsg:4326'}
peak_cr.head()
/usr/local/lib/python3.6/dist-packages/pyproj/crs/crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))
sounding_id latitude longitude orbit slope intercept amplitude sigma delta R windspeed_u windspeed_v surface_pressure surface_pressure_apriori land_water_indicator land_fraction surf_pres y_class_lof outlier_score_lof y_class_dbscan y_class_lof_only_gaussian_param y_class_dbscan_only_gaussian_param geometry
0 2018080101035604 25.425072 -177.345490 21709 0.000430 404.902899 -14.694226 -3.987167 1.470254 0.677812 -7.759225 -0.739198 1018.198181 1016.253113 1.0 0.0 -1.945068 -1 -1.501400 -1 1 -1 POINT (-177.34549 25.42507)
1 2018080101060803 32.777554 -179.428162 21709 0.002886 404.622407 1.493849 7.887665 0.075556 0.532213 -5.420354 -0.992332 1025.610962 1022.500305 1.0 0.0 -3.110657 1 -1.021724 1 1 1 POINT (-179.42816 32.77755)
2 2018080101062035 33.441360 -179.630508 21709 -0.002734 404.470875 18.128640 3.086571 2.343144 -0.656713 -4.649583 -0.913302 1023.379456 1022.977173 1.0 0.0 -0.402283 1 -1.171819 -1 1 -1 POINT (-179.63051 33.44136)
3 2018080101062937 33.919857 -179.779083 21709 -0.006755 404.416030 -11.847855 -3.346435 1.412432 0.592854 -4.035365 -0.879741 1026.469604 1023.283203 1.0 0.0 -3.186401 1 -1.461872 -1 1 -1 POINT (-179.77908 33.91986)
4 2018080102530302 58.858116 143.781662 21710 0.003719 399.445056 -0.018193 -5.552475 0.001307 0.730703 -3.837832 0.812698 995.894531 995.617310 1.0 0.0 -0.277222 1 -1.290330 -1 -1 -1 POINT (143.78166 58.85812)
{% endraw %}

K-MEANS Filtered Peaks

{% raw %}
# Charlotte new kmeans method
path_peaks_kmean = "./data/peaks/peaks_out_1808_testkmeans.csv"
peak_kmean = pd.read_csv(path_peaks_kmean, sep=",")
peak_kmean.head()
orbit R amplitude delta intercept latitude longitude sigma slope sounding_id windspeed_u windspeed_v date latitude_orig longitude_orig distance surface_pressure surface_pressure_apriori land_water_indicator land_fraction surf_pres kmeans_cluster
0 21709 0.532213 1.493853 0.075556 404.622407 32.777554 -179.428162 7.887675 0.002886 2018080101060803 -5.420354 -0.992332 2018-08-01 01:06:08.030 25.425072 -177.345490 841.633513 1025.610962 1022.500305 1.0 0.0 -3.110657 0
1 21711 0.577045 9.859754 0.159146 405.367275 35.680077 130.219467 24.716167 -0.004196 2018080104244674 -2.515345 -1.818006 2018-08-01 04:24:46.740 35.680077 130.219467 0.000000 1005.533569 1003.426147 1.0 0.0 -2.107422 3
2 21711 0.511861 7.635086 0.466758 405.341883 35.752354 130.197632 6.525779 -0.004529 2018080104244836 -2.536525 -1.833167 2018-08-01 04:24:48.360 35.680077 130.219467 8.269885 1006.090515 1003.472229 1.0 0.0 -2.618286 0
3 21711 0.533806 7.751707 0.322542 405.311837 35.789249 130.185776 9.587837 -0.004221 2018080104244906 -2.545387 -1.840344 2018-08-01 04:24:49.060 35.680077 130.219467 12.506704 1004.246460 1003.495850 1.0 0.0 -0.750610 0
4 21711 0.514400 9.293186 0.225309 405.271937 35.826172 130.173950 16.454931 -0.003825 2018080104244976 -2.552335 -1.846606 2018-08-01 04:24:49.760 35.680077 130.219467 16.745752 1005.441772 1003.519531 1.0 0.0 -1.922241 0
{% endraw %} {% raw %}
peak_kmean = gpd.GeoDataFrame(peak_kmean, geometry=gpd.points_from_xy(peak_kmean.longitude, peak_kmean.latitude)).copy()
peak_kmean.crs = {'init': 'epsg:4326'}
peak_kmean.describe()
/usr/local/lib/python3.6/dist-packages/pyproj/crs/crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))
orbit R amplitude delta intercept latitude longitude sigma slope sounding_id windspeed_u windspeed_v latitude_orig longitude_orig distance surface_pressure surface_pressure_apriori land_water_indicator land_fraction surf_pres kmeans_cluster
count 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000 2.528000e+03 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000 2528.000000
mean 21942.228244 0.599660 16.378871 0.403746 404.221340 20.335576 -3.532016 16.841955 -0.001384 2.018082e+15 -1.199901 -0.276043 0.335266 -7.246776 2282.784891 994.598029 992.967822 0.669699 33.910997 -1.630207 1.505538
std 135.969533 0.082657 16.691320 0.428862 1.979237 26.483743 105.474387 6.615117 0.006493 9.345091e+08 4.443105 3.740615 25.328278 106.404332 2791.976220 45.705697 45.509663 0.490188 47.312737 2.632989 1.337701
min 21709.000000 0.500107 0.010174 0.000341 399.106487 -40.021149 -179.523438 2.027055 -0.024920 2.018080e+15 -9.672325 -9.554490 -40.021149 -179.427689 0.000000 738.274963 733.451233 0.000000 0.000000 -11.428528 0.000000
25% 21809.000000 0.533753 5.769051 0.152905 402.834422 -3.690342 -112.089016 11.687496 -0.005904 2.018081e+15 -4.772796 -3.251342 -20.239977 -116.776398 27.543121 995.949432 994.709412 0.000000 0.000000 -3.348526 0.000000
50% 21957.000000 0.576914 12.248867 0.294233 404.558087 29.462646 16.444787 16.116074 -0.002801 2.018082e+15 -1.935188 -0.219496 -5.350143 8.237496 906.387661 1013.685760 1011.713654 1.000000 0.000000 -2.003723 2.000000
75% 22061.000000 0.646172 21.686864 0.506560 405.946240 40.454978 67.995306 21.275639 0.003901 2.018083e+15 2.315969 2.955302 22.579563 66.774048 4083.128058 1017.459961 1015.602310 1.000000 100.000000 -0.051285 3.000000
max 22159.000000 0.899008 162.436539 5.603435 408.913894 72.424393 179.568573 33.249878 0.025194 2.018083e+15 13.551064 11.974917 71.035431 179.295044 11758.590148 1035.291138 1032.145752 3.000000 100.000000 7.868530 4.000000
{% endraw %}

Cities estimates

The CSV contains data from the world's biggest cities. The CSV is stored on GitHub.

{% raw %}
path_cities = "https://raw.githubusercontent.com/dataforgoodfr/batch7_satellite_ges/master/dataset/cities_v1.csv"
cities = pd.read_csv(path_cities, sep=",", index_col=0)
cities.head()
City name Country Scope-1 GHG emissions [tCO2 or tCO2-eq] Scope-1 source dataset Scope-1 GHG emissions units Year of emission City location (CDP) [degrees] Population (CDP) Population year (CDP) latitude longitude
0 Toronto Canada 16151019.0 CDP tCO2 2013 43.653226,-79.3831843 2753100.0 2011.0 43.653226 -79.383184
1 Santiago de Cali Colombia NaN CDP tCO2-eq 2010 3.451647,-76.531985 2369829.0 2015.0 3.451647 -76.531985
4 Milano Italy 3728678.0 CDP tCO2 2013 45.802578,9.086356 1350680.0 2014.0 45.802578 9.086356
5 Hayward USA 861854.0 CDP tCO2-eq 2015 37.6689,-122.0808 158985.0 2015.0 37.668900 -122.080800
6 Tokyo Japan 27611000.0 CDP tCO2-eq 2014 35.6896342,139.6921007 13513734.0 2015.0 35.689634 139.692101
{% endraw %} {% raw %}
cities = gpd.GeoDataFrame(cities, geometry=gpd.points_from_xy(cities.longitude, cities.latitude))
cities.crs = {'init': 'epsg:4326'}
cities.head()
/usr/local/lib/python3.6/dist-packages/pyproj/crs/crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))
City name Country Scope-1 GHG emissions [tCO2 or tCO2-eq] Scope-1 source dataset Scope-1 GHG emissions units Year of emission City location (CDP) [degrees] Population (CDP) Population year (CDP) latitude longitude geometry
0 Toronto Canada 16151019.0 CDP tCO2 2013 43.653226,-79.3831843 2753100.0 2011.0 43.653226 -79.383184 POINT (-79.38318 43.65323)
1 Santiago de Cali Colombia NaN CDP tCO2-eq 2010 3.451647,-76.531985 2369829.0 2015.0 3.451647 -76.531985 POINT (-76.53198 3.45165)
4 Milano Italy 3728678.0 CDP tCO2 2013 45.802578,9.086356 1350680.0 2014.0 45.802578 9.086356 POINT (9.08636 45.80258)
5 Hayward USA 861854.0 CDP tCO2-eq 2015 37.6689,-122.0808 158985.0 2015.0 37.668900 -122.080800 POINT (-122.08080 37.66890)
6 Tokyo Japan 27611000.0 CDP tCO2-eq 2014 35.6896342,139.6921007 13513734.0 2015.0 35.689634 139.692101 POINT (139.69210 35.68963)
{% endraw %}

Power Plants sources

The CSV contains data from the worlds biggest power plants. The CSV is stored on the GitHub.

{% raw %}
path_plants = "https://raw.githubusercontent.com/dataforgoodfr/batch7_satellite_ges/master/dataset/CO2_emissions_centrale.csv"
plants = pd.read_csv(path_plants, sep=",", index_col=0)
plants.head()
country_long capacity_mw latitude longitude primary_fuel generation_gwh_2013 generation_gwh_2014 generation_gwh_2015 generation_gwh_2016 generation_gwh_2017 estimated_generation_gwh gCO2/KWh generation_gwh_2017_with_estimated_data tCO2_emitted_in_2013 tCO2_emitted_in_2014 tCO2_emitted_in_2015 tCO2_emitted_in_2016 tCO2_emitted_in_2017 tCO2_emitted_in_2017_with estimated_data
0 Afghanistan 42.0 34.5638 69.1134 Gas NaN NaN NaN NaN NaN NaN 443 NaN NaN NaN NaN NaN NaN NaN
1 Algeria 520.0 35.8665 6.0262 Gas NaN NaN NaN NaN NaN 2152.249819 443 2152.249819 NaN NaN NaN NaN NaN 9.534467e+05
2 Algeria 71.0 36.8924 7.7634 Gas NaN NaN NaN NaN NaN 293.864879 443 293.864879 NaN NaN NaN NaN NaN 1.301821e+05
3 Algeria 560.0 36.5988 3.1375 Gas NaN NaN NaN NaN NaN 2317.807497 443 2317.807497 NaN NaN NaN NaN NaN 1.026789e+06
4 Algeria 100.0 36.5914 2.9223 Gas NaN NaN NaN NaN NaN 413.894196 443 413.894196 NaN NaN NaN NaN NaN 1.833551e+05
{% endraw %} {% raw %}
plants = gpd.GeoDataFrame(plants, geometry=gpd.points_from_xy(plants.longitude, plants.latitude))
plants.crs = {'init': 'epsg:4326'}
plants.head()
/usr/local/lib/python3.6/dist-packages/pyproj/crs/crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))
country_long capacity_mw latitude longitude primary_fuel generation_gwh_2013 generation_gwh_2014 generation_gwh_2015 generation_gwh_2016 generation_gwh_2017 estimated_generation_gwh gCO2/KWh generation_gwh_2017_with_estimated_data tCO2_emitted_in_2013 tCO2_emitted_in_2014 tCO2_emitted_in_2015 tCO2_emitted_in_2016 tCO2_emitted_in_2017 tCO2_emitted_in_2017_with estimated_data geometry
0 Afghanistan 42.0 34.5638 69.1134 Gas NaN NaN NaN NaN NaN NaN 443 NaN NaN NaN NaN NaN NaN NaN POINT (69.11340 34.56380)
1 Algeria 520.0 35.8665 6.0262 Gas NaN NaN NaN NaN NaN 2152.249819 443 2152.249819 NaN NaN NaN NaN NaN 9.534467e+05 POINT (6.02620 35.86650)
2 Algeria 71.0 36.8924 7.7634 Gas NaN NaN NaN NaN NaN 293.864879 443 293.864879 NaN NaN NaN NaN NaN 1.301821e+05 POINT (7.76340 36.89240)
3 Algeria 560.0 36.5988 3.1375 Gas NaN NaN NaN NaN NaN 2317.807497 443 2317.807497 NaN NaN NaN NaN NaN 1.026789e+06 POINT (3.13750 36.59880)
4 Algeria 100.0 36.5914 2.9223 Gas NaN NaN NaN NaN NaN 413.894196 443 413.894196 NaN NaN NaN NaN NaN 1.833551e+05 POINT (2.92230 36.59140)
{% endraw %}

Coal Plants sources

The CSV contains data from the worlds biggest coal plants. The CSV is stored on the GitHub.

{% raw %}
path_plants_coal = "https://raw.githubusercontent.com/dataforgoodfr/batch7_satellite_ges/master/dataset/CO2_emissions_coal_plant.csv"
plants_coal = pd.read_csv(path_plants_coal, sep=",", index_col=0)
plants_coal.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12875 entries, 0 to 12874
Data columns (total 17 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Country                                            12875 non-null  object 
 1   Plant                                              12873 non-null  object 
 2   Capacity (MW)                                      12866 non-null  object 
 3   Status                                             12875 non-null  object 
 4   Status 2016 (done by Data4Good)                    12875 non-null  object 
 5   Status 2017 (done by Data4Good)                    12875 non-null  object 
 6   Status 2018 (done by Data4Good)                    12875 non-null  object 
 7   Status 2019 (done by Data4Good)                    12875 non-null  object 
 8   Combustion technology                              12875 non-null  object 
 9   Coal type                                          12875 non-null  object 
 10  Latitude                                           12754 non-null  float64
 11  Longitude                                          12754 non-null  float64
 12  Annual CO2 (million tonnes / annum)                12871 non-null  float64
 13  Annual CO2 emissions (millions of tonnes) in 2016  6758 non-null   float64
 14  Annual CO2 emissions (millions of tonnes) in 2017  6785 non-null   float64
 15  Annual CO2 emissions (millions of tonnes) in 2018  6778 non-null   float64
 16  Annual CO2 emissions (millions of tonnes) in 2019  6825 non-null   float64
dtypes: float64(7), object(10)
memory usage: 1.8+ MB
{% endraw %} {% raw %}
plants_coal = plants_coal[plants_coal['Longitude'].notna()]
plants_coal = plants_coal[plants_coal['Latitude'].notna()]
plants_coal.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12754 entries, 0 to 12874
Data columns (total 17 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Country                                            12754 non-null  object 
 1   Plant                                              12752 non-null  object 
 2   Capacity (MW)                                      12745 non-null  object 
 3   Status                                             12754 non-null  object 
 4   Status 2016 (done by Data4Good)                    12754 non-null  object 
 5   Status 2017 (done by Data4Good)                    12754 non-null  object 
 6   Status 2018 (done by Data4Good)                    12754 non-null  object 
 7   Status 2019 (done by Data4Good)                    12754 non-null  object 
 8   Combustion technology                              12754 non-null  object 
 9   Coal type                                          12754 non-null  object 
 10  Latitude                                           12754 non-null  float64
 11  Longitude                                          12754 non-null  float64
 12  Annual CO2 (million tonnes / annum)                12750 non-null  float64
 13  Annual CO2 emissions (millions of tonnes) in 2016  6740 non-null   float64
 14  Annual CO2 emissions (millions of tonnes) in 2017  6769 non-null   float64
 15  Annual CO2 emissions (millions of tonnes) in 2018  6762 non-null   float64
 16  Annual CO2 emissions (millions of tonnes) in 2019  6810 non-null   float64
dtypes: float64(7), object(10)
memory usage: 1.8+ MB
{% endraw %} {% raw %}
plants_coal = gpd.GeoDataFrame(plants_coal, geometry=gpd.points_from_xy(plants_coal.Longitude, plants_coal.Latitude))
plants_coal.crs = {'init': 'epsg:4326'}
plants_coal.head()
/usr/local/lib/python3.6/dist-packages/pyproj/crs/crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))
Country Plant Capacity (MW) Status Status 2016 (done by Data4Good) Status 2017 (done by Data4Good) Status 2018 (done by Data4Good) Status 2019 (done by Data4Good) Combustion technology Coal type Latitude Longitude Annual CO2 (million tonnes / annum) Annual CO2 emissions (millions of tonnes) in 2016 Annual CO2 emissions (millions of tonnes) in 2017 Annual CO2 emissions (millions of tonnes) in 2018 Annual CO2 emissions (millions of tonnes) in 2019 geometry
0 Albania Porto Romano Power Station 800 Cancelled Not operating Not operating Not operating Not operating Ultra-super Bituminous 41.37114 19.4252 3.217105 NaN NaN NaN NaN POINT (19.42520 41.37114)
1 Argentina Río Turbio power station 120 Mothballed Not operating Not operating Not operating Not operating Subcritical Bituminous -51.54600 -72.2313 0.609181 NaN NaN NaN NaN POINT (-72.23130 -51.54600)
2 Argentina Río Turbio power station 120 Shelved Not operating Not operating Not operating Not operating Subcritical Bituminous -51.54600 -72.2313 0.609181 NaN NaN NaN NaN POINT (-72.23130 -51.54600)
3 Argentina San Nicolás power station 350 Operating Operating Operating Operating Operating Subcritical Bituminous -33.35490 -60.1729 1.998875 1.998875 1.998875 1.998875 1.998875 POINT (-60.17290 -33.35490)
4 Australia Anglesea power station 160 Retired Not operating Not operating Not operating Not operating Subcritical Lignite -38.38650 144.1821 1.047857 NaN NaN NaN NaN POINT (144.18210 -38.38650)
{% endraw %}

Inventory Capture

{% raw %}
def get_direction_from_uv(u, v):
    '''Retrieve the heading of a vector, in degrees in [0, 360).'''
    # atan2(u, v) gives the angle from the v axis; shift by 180 so the
    # result expresses where the wind is blowing from.
    return math.degrees(math.atan2(u, v)) + 180

def get_wind_norm_from_uv(u, v):
    '''Retrieve the magnitude (Euclidean norm) of a vector.

    Uses math.hypot, which avoids the overflow/underflow of the naive
    sqrt(u**2 + v**2) for very large or very small components.
    '''
    return math.hypot(u, v)

def get_new_coord(lat, lon, d, b):
    '''Calculate the arrival point of a vector, given a starting point,
    a distance (km) and a direction (bearing in degrees).

    Returns [longitude, latitude] of the destination point
    (note the lon-first order, matching shapely's x/y convention).
    '''
    # VincentyDistance was removed in geopy 2.0; prefer the geodesic
    # replacement and fall back to the legacy name on old geopy versions.
    try:
        from geopy.distance import geodesic as _distance
    except ImportError:
        _distance = VincentyDistance
    origin = geopy.Point(lat, lon)
    point = _distance(kilometers=d).destination(origin, b)
    return [point[1], point[0]]

def capture_zone(lat, lon, u, v, angle=50):
    '''Calculate the capture zone around a point, given the point, a wind
    vector (u, v) and an opening angle (degrees) shaping the zone.

    The zone extends 6 h of wind travel behind the point and 24 h in
    front of it.  Returns a closed list of [lon, lat] vertices.
    '''
    wind_heading = get_direction_from_uv(u, v)
    # m/s -> km/h so distances below are in kilometres.
    wind_speed = get_wind_norm_from_uv(u, v) * 3.6
    back_dist = wind_speed * 6
    front_dist = wind_speed * 24

    # (distance, bearing) pairs, in polygon-ring order:
    # back centre, back -angle, front +angle, front centre,
    # front -angle, back +angle.
    legs = [
        (back_dist, wind_heading + 180),
        (back_dist, wind_heading + 180 - angle),
        (front_dist, wind_heading + angle),
        (front_dist, wind_heading),
        (front_dist, wind_heading - angle),
        (back_dist, wind_heading + 180 + angle),
    ]
    points = [get_new_coord(lat, lon, dist, bearing) for dist, bearing in legs]
    # Close the ring by repeating the first vertex.
    points.append(points[0])
    return points

def capture_df(row):
    '''Build the capture-zone Polygon for one dataset row from its
    position and wind vector.'''
    ring = capture_zone(row["latitude"], row["longitude"],
                        row["windspeed_u"], row["windspeed_v"])
    return Polygon(ring)

def join_and_count(peaks, cities, plants):
    '''Spatially join the peak capture zones (Polygons) with the city and
    plant point datasets, and count the sources captured per peak.

    Returns the peaks GeoDataFrame augmented with, for each sounding_id,
    the lists of matched city/plant indices and their counts.
    '''
    # Cities falling inside each capture zone
    city_cols = cities.loc[:, ['Population (CDP)', 'geometry']]
    peaks_intersect_invent = gpd.sjoin(peaks, city_cols, how='left', op='intersects').rename(columns={"index_right": "index_cities"})
    peaks_intersect_ag = peaks_intersect_invent.groupby([peaks_intersect_invent.index, 'sounding_id'])["index_cities"].apply(list).reset_index()
    # A NaN index means "no city matched"; count only real matches.
    peaks_intersect_ag["number_cities"] = peaks_intersect_ag["index_cities"].apply(lambda x: np.count_nonzero(~np.isnan(x)))
    peaks_meters_cities = peaks.merge(peaks_intersect_ag, how='left', on='sounding_id')

    # Plants falling inside each capture zone
    plant_cols = plants.loc[:, ['estimated_generation_gwh', 'geometry']]
    peaks_intersect_plants = gpd.sjoin(peaks, plant_cols, how='left', op='intersects').rename(columns={"index_right": "index_plants"})
    peaks_intersect_ag = peaks_intersect_plants.groupby([peaks_intersect_plants.index, 'sounding_id'])["index_plants"].apply(list).reset_index()
    peaks_intersect_ag["number_plants"] = peaks_intersect_ag["index_plants"].apply(lambda x: np.count_nonzero(~np.isnan(x)))

    # Merge both counts on sounding_id.  The 'level_0' helper columns
    # produced by reset_index collide and get _x/_y suffixes; drop them.
    peaks_and_sources = peaks_meters_cities.merge(peaks_intersect_ag, how='left', on='sounding_id')
    peaks_and_sources = peaks_and_sources.drop(columns=['level_0_x', 'level_0_y'])
    return peaks_and_sources
{% endraw %}

Gaussian-only Peaks

We set a capture zone and spatially join the data for FC's peaks:

{% raw %}
# Remove far east and west to avoid side effects on captures zones 
peaks_fc = peak_fc.loc[(peak_fc["longitude"] < 175) & (peak_fc["longitude"] > -175), :]
peaks_fc['geometry'] = peaks_fc.apply(capture_df, axis=1)
peaks_fc_and_sources = join_and_count(peaks_fc, cities, plants)
peaks_fc_and_sources.head(3)
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
sounding_id latitude longitude orbit slope intercept amplitude sigma delta R windspeed_u windspeed_v surface_pressure_apriori surface_pressure altitude land_water_indicator land_fraction date geometry index_cities number_cities index_plants number_plants
0 2018080111005177 37.444622 30.755703 21715 0.006885 404.427158 64.732216 11.112728 2.323860 0.654418 0.284106 -0.315598 963.941162 964.808594 367.765533 0.0 100.0 2018-08-01 11:00:51.770 POLYGON ((30.82499 37.38318, 30.85930 37.44745, 30.81370 37.77195, 30.47740 37.68998, 30.34141 37.43239, 30.74128 37.36278, 30.82499 37.38318)) [nan] 0 [nan] 0
1 2018080220003337 44.050335 -106.099586 21735 -0.002019 405.980164 48.714267 14.632683 1.328135 0.528694 1.296467 -3.582739 859.926086 863.947021 1374.157227 0.0 100.0 2018-08-02 20:00:33.370 POLYGON ((-105.75418 43.35330, -105.13937 43.79166, -103.94523 46.59303, -107.56714 46.82641, -110.02169 45.00211, -106.60924 43.40841, -105.75418 43.35330)) [nan] 0 [6132.0, 6133.0, 7236.0, 5767.0, 6707.0, 7370.0, 7371.0, 8468.0, 8471.0, 8470.0, 8469.0, 6299.0, 5720.0, 6357.0, 6692.0, 7598.0] 16
2 2018080408145507 40.436111 71.393532 21757 0.024100 403.094724 43.217484 16.532307 1.042884 0.811078 3.384520 -1.615621 956.226440 962.151306 421.954071 0.0 100.0 2018-08-04 08:14:55.070 POLYGON ((72.25113 40.11864, 72.26632 40.73514, 70.39597 43.25657, 67.88303 41.64029, 67.98027 39.17581, 71.62983 39.72949, 72.25113 40.11864)) [nan] 0 [8508.0, 8504.0, 8507.0, 8503.0, 8511.0] 5
{% endraw %}

LOF & DBSCAN Peaks

We set a capture zone and spatially join the data for CR's filtered peaks.

{% raw %}
# Remove far east and west to avoid side effects on captures zones 
peaks_cr = peak_cr.loc[(peak_cr["longitude"] < 175) & (peak_cr["longitude"] > -175), :]
peaks_cr['geometry'] = peaks_cr.apply(capture_df, axis=1)
peaks_cr_and_sources = join_and_count(peaks_cr, cities, plants)
peaks_cr_and_sources.head()
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
sounding_id latitude longitude orbit slope intercept amplitude sigma delta R windspeed_u windspeed_v surface_pressure surface_pressure_apriori land_water_indicator land_fraction surf_pres y_class_lof outlier_score_lof y_class_dbscan y_class_lof_only_gaussian_param y_class_dbscan_only_gaussian_param geometry index_cities number_cities index_plants number_plants
0 2018080102530302 58.858116 143.781662 21710 0.003719 399.445056 -0.018193 -5.552475 0.001307 0.730703 -3.837832 0.812698 995.894531 995.617310 1.0 0.0 -0.277222 1 -1.290330 -1 -1 -1 POLYGON ((142.33892 59.00768, 142.64085 58.38422, 146.34492 56.14489, 149.41173 58.10277, 148.66427 60.64502, 143.07774 59.52757, 142.33892 59.00768)) [nan] 0 [nan] 0
1 2018080104244674 35.680077 130.219467 21711 -0.004196 405.367275 9.859749 24.716153 0.159146 0.577045 -2.515345 -1.818006 1005.533569 1003.426147 1.0 0.0 -2.107422 1 -0.996306 1 1 1 POLYGON ((129.62193 35.32466, 130.16638 35.07742, 133.07010 35.05573, 132.66330 37.07097, 130.44018 38.08984, 129.50007 35.82555, 129.62193 35.32466)) [nan] 0 [nan] 0
2 2018080104244836 35.752354 130.197632 21711 -0.004529 405.341883 7.635088 6.525770 0.466759 0.511861 -2.536525 -1.833167 1006.090515 1003.472229 1.0 0.0 -2.618286 1 -1.027492 1 1 1 POLYGON ((129.59455 35.39396, 130.14403 35.14465, 133.07450 35.12231, 132.66476 37.15454, 130.42059 38.18230, 129.47154 35.89904, 129.59455 35.39396)) [nan] 0 [nan] 0
3 2018080104244906 35.789249 130.185776 21711 -0.004221 405.311837 7.751707 9.587837 0.322542 0.533806 -2.545387 -1.840344 1004.246460 1003.495850 1.0 0.0 -0.750610 1 -1.016994 1 1 1 POLYGON ((129.58032 35.42945, 130.13210 35.17933, 133.07449 35.15722, 132.66290 37.19680, 130.40907 38.22806, 129.45666 35.93634, 129.58032 35.42945)) [nan] 0 [nan] 0
4 2018080104244976 35.826172 130.173950 21711 -0.003825 405.271937 9.293184 16.454924 0.225309 0.514400 -2.552335 -1.846606 1005.441772 1003.519531 1.0 0.0 -1.922241 1 -1.009979 1 1 1 POLYGON ((129.56658 35.46514, 130.12033 35.21443, 133.07272 35.19288, 132.65920 37.23841, 130.39705 38.27224, 129.44227 35.97350, 129.56658 35.46514)) [nan] 0 [nan] 0
{% endraw %}

K-Means Peaks

We set a capture zone and spatially join the data for Charlotte's clustered peaks.

{% raw %}
# Remove far east and west to avoid side effects on capture zones.
# .copy() avoids pandas' SettingWithCopyWarning on the geometry
# assignment below (we never want to write back into peak_kmean
# through a view).
peaks_kmean = peak_kmean.loc[(peak_kmean["longitude"] < 175) & (peak_kmean["longitude"] > -175), :].copy()
peaks_kmean['geometry'] = peaks_kmean.apply(capture_df, axis=1)
peaks_kmean_and_sources = join_and_count(peaks_kmean, cities, plants)
peaks_kmean_and_sources.head()
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
orbit R amplitude delta intercept latitude longitude sigma slope sounding_id windspeed_u windspeed_v date latitude_orig longitude_orig distance surface_pressure surface_pressure_apriori land_water_indicator land_fraction surf_pres kmeans_cluster geometry index_cities number_cities index_plants number_plants
0 21711 0.577045 9.859754 0.159146 405.367275 35.680077 130.219467 24.716167 -0.004196 2018080104244674 -2.515345 -1.818006 2018-08-01 04:24:46.740 35.680077 130.219467 0.000000 1005.533569 1003.426147 1.0 0.0 -2.107422 3 POLYGON ((129.62193 35.32466, 130.16638 35.07742, 133.07010 35.05573, 132.66330 37.07097, 130.44018 38.08984, 129.50007 35.82555, 129.62193 35.32466)) [nan] 0 [nan] 0
1 21711 0.511861 7.635086 0.466758 405.341883 35.752354 130.197632 6.525779 -0.004529 2018080104244836 -2.536525 -1.833167 2018-08-01 04:24:48.360 35.680077 130.219467 8.269885 1006.090515 1003.472229 1.0 0.0 -2.618286 0 POLYGON ((129.59455 35.39396, 130.14403 35.14465, 133.07450 35.12231, 132.66476 37.15454, 130.42059 38.18230, 129.47154 35.89904, 129.59455 35.39396)) [nan] 0 [nan] 0
2 21711 0.533806 7.751707 0.322542 405.311837 35.789249 130.185776 9.587837 -0.004221 2018080104244906 -2.545387 -1.840344 2018-08-01 04:24:49.060 35.680077 130.219467 12.506704 1004.246460 1003.495850 1.0 0.0 -0.750610 0 POLYGON ((129.58032 35.42945, 130.13210 35.17933, 133.07449 35.15722, 132.66290 37.19680, 130.40907 38.22806, 129.45666 35.93634, 129.58032 35.42945)) [nan] 0 [nan] 0
3 21711 0.514400 9.293186 0.225309 405.271937 35.826172 130.173950 16.454931 -0.003825 2018080104244976 -2.552335 -1.846606 2018-08-01 04:24:49.760 35.680077 130.219467 16.745752 1005.441772 1003.519531 1.0 0.0 -1.922241 0 POLYGON ((129.56658 35.46514, 130.12033 35.21443, 133.07272 35.19288, 132.65920 37.23841, 130.39705 38.27224, 129.44227 35.97350, 129.56658 35.46514)) [nan] 0 [nan] 0
4 21711 0.616669 8.763454 0.195147 405.187786 35.966625 130.125778 17.915272 -0.004961 2018080104245173 -2.563400 -1.864844 2018-08-01 04:24:51.730 35.680077 130.219467 32.942705 1004.669678 1003.609253 1.0 0.0 -1.060425 3 POLYGON ((129.51473 35.60203, 130.07370 35.35097, 133.04967 35.33520, 132.62691 37.39267, 130.34257 38.42837, 129.38775 36.11327, 129.51473 35.60203)) [nan] 0 [5141.0] 1
{% endraw %}

Map Visualization

Inventory Map

To display only the inventory data on a folium map:

{% raw %}
def inventory_map():
    """Build a folium map of the emission-source inventory and return it.

    Draws three toggleable layers — power plants, coal plants and cities —
    as circle markers sized by generation / emissions / population, adds a
    layer control, a fullscreen button and a minimap, and saves the result
    to ``inventory_map.html``.

    Relies on the module-level ``plants``, ``plants_coal`` and ``cities``
    dataframes.
    """
    # Initialize Map
    inventory_map = folium.Map([43, 0], zoom_start=4)
    folium.TileLayer("CartoDB dark_matter", name="Dark mode").add_to(inventory_map)

    # Adding Power plants (circle radius ~ estimated generation)
    plants_group = folium.FeatureGroup(name="Plants").add_to(inventory_map)
    for index, row in plants.iterrows():
        radius = row['estimated_generation_gwh']/10000
        color="#3186CC" # blue

        tooltip =  "["+str(round(row['latitude'],2))+" ; "+str(round(row['longitude'],2))+"]"
        emit = str(round(row['estimated_generation_gwh'],2))
        popup_html="""<h4>"""+tooltip+"""</h4>"""+row['country_long']+"""<p><b>Emission 2018 (est):</b> """+emit+""" GWh</p>"""
        popup=folium.Popup(popup_html, max_width=450)

        plants_group.add_child(folium.CircleMarker(location=(row["latitude"],
                                      row["longitude"]),
                            radius=radius,
                            color=color,
                            popup=popup,
                            tooltip=tooltip,
                            fill=True))

    # Adding Coal plants (circle radius ~ 2018 CO2 emissions)
    plants_coal_group = folium.FeatureGroup(name="Coal Plants").add_to(inventory_map)
    for index, row in plants_coal.iterrows():
        radius = row['Annual CO2 emissions (millions of tonnes) in 2018']/10000
        color="#3186CC" # blue

        # BUG FIX: the original passed the ``popup``/``tooltip`` variables
        # left over from the last power-plant iteration (stale data, and a
        # NameError if ``plants`` were empty). Build them from this row.
        tooltip =  "["+str(round(row['Latitude'],2))+" ; "+str(round(row['Longitude'],2))+"]"
        emit = str(round(row['Annual CO2 emissions (millions of tonnes) in 2018'],2))
        popup_html="""<h4>"""+tooltip+"""</h4>"""+str(row['Plant'])+"""<p><b>Emission 2018 (est):</b> """+emit+""" Mt CO2</p>"""
        popup=folium.Popup(popup_html, max_width=450)

        plants_coal_group.add_child(folium.CircleMarker(location=(row["Latitude"],
                                      row["Longitude"]),
                            radius=radius,
                            color=color,
                            popup=popup,
                            tooltip=tooltip,
                            fill=True))

    # Adding Cities (circle radius ~ population)
    cities_group = folium.FeatureGroup(name="Cities").add_to(inventory_map)
    for index, row in cities.iterrows():
        radius = row['Population (CDP)']/2000000
        color="#FEF65B" # yellow

        tooltip =  "["+str(round(row['latitude'],2))+" ; "+str(round(row['longitude'],2))+"]"
        pop = str(round(row['Population (CDP)'],0))
        title = "" + str(row['City name']) + ", " + str(row['Country'])
        # (the original built popup_html twice; only the last value was used)
        popup_html = """<h4>"""+title+"""</h4><p>"""+tooltip+"""</p>"""+"""<p><b>Population 2017:</b> """+pop+"""</p>"""
        popup=folium.Popup(popup_html, max_width=450)

        cities_group.add_child(folium.CircleMarker(location=(row["latitude"],
                                      row["longitude"]),
                            radius=radius,
                            color=color,
                            popup=popup,
                            tooltip=tooltip,
                            fill=True))

    folium.map.LayerControl(collapsed=False).add_to(inventory_map)

    plugins.Fullscreen(
        position='topright',
        title='Expand me',
        title_cancel='Exit me',
        force_separate_button=True
    ).add_to(inventory_map)

    minimap = plugins.MiniMap()
    inventory_map.add_child(minimap)

    inventory_map.save("inventory_map.html")
    return inventory_map

inventory_map()
Make this Notebook Trusted to load map: File -> Trust Notebook
{% endraw %}

Map Plot to compare peak detection techniques

To display only the different detected peaks on a folium map:

{% raw %}
def peaks_map():
    """Build a folium map comparing the raw detected peaks with the
    LOF- and DBSCAN-filtered subsets, save it to ``peaks_methods.html``
    and return it. Uses the module-level ``peak_cr`` dataframe.
    """
    # Base map in dark mode
    peaks_methods = folium.Map([43, 0], zoom_start=4)
    folium.TileLayer("CartoDB dark_matter", name="Dark mode").add_to(peaks_methods)

    def add_peak(group, row, color):
        # One circle per peak, sized by the fitted Gaussian amplitude.
        group.add_child(folium.CircleMarker(
            location=(row["latitude"], row["longitude"]),
            radius=row["amplitude"] / 20,
            color=color,
            fill=True))

    # All detected peaks
    peaks_group = folium.FeatureGroup(name="Peaks").add_to(peaks_methods)
    for _, row in peak_cr.iterrows():
        add_peak(peaks_group, row, "#FF3333")  # red

    # Peaks flagged by the LOF outlier detector
    lof_group = folium.FeatureGroup(name="LOF detection").add_to(peaks_methods)
    for _, row in peak_cr.iterrows():
        if row['y_class_lof_only_gaussian_param'] < 0:
            add_peak(lof_group, row, "#FF66CC")  # pink

    # Peaks flagged by the DBSCAN outlier detector
    dbscan_group = folium.FeatureGroup(name="DBSCAN detection").add_to(peaks_methods)
    for _, row in peak_cr.iterrows():
        if row['y_class_dbscan_only_gaussian_param'] < 0:
            add_peak(dbscan_group, row, "#6A287E")  # purple

    folium.map.LayerControl(collapsed=False).add_to(peaks_methods)

    plugins.Fullscreen(
        position='topright',
        title='Expand me',
        title_cancel='Exit me',
        force_separate_button=True
    ).add_to(peaks_methods)

    peaks_methods.add_child(plugins.MiniMap())

    peaks_methods.save("peaks_methods.html")
    return peaks_methods

peaks_map()
Make this Notebook Trusted to load map: File -> Trust Notebook
{% endraw %}

K-Means Clusters Map

{% raw %}
def peaks_map():
    """Build a folium map comparing raw peaks, LOF/DBSCAN-filtered peaks
    and the K-Means clusters, save it to ``peaks_methods.html`` and
    return it.

    Uses the module-level ``peak_cr`` and ``peak_kmean`` dataframes.
    NOTE(review): redefines the ``peaks_map`` of the previous cell —
    intentional notebook flow, kept as-is.
    """
    # Initialize Map
    peaks_methods = folium.Map([43, 0], zoom_start=4)
    folium.TileLayer("CartoDB dark_matter", name="Dark mode").add_to(peaks_methods)

    def add_peak(group, row, color, radius):
        group.add_child(folium.CircleMarker(
            location=(row["latitude"], row["longitude"]),
            radius=radius,
            color=color,
            fill=True))

    # Adding all detected peaks, sized by Gaussian amplitude
    peaks_group = folium.FeatureGroup(name="Peaks").add_to(peaks_methods)
    for _, row in peak_cr.iterrows():
        add_peak(peaks_group, row, "#FF3333", row["amplitude"] / 20)  # red

    # Adding LOF detected peaks
    lof_group = folium.FeatureGroup(name="LOF detection").add_to(peaks_methods)
    for _, row in peak_cr.iterrows():
        if row['y_class_lof_only_gaussian_param'] < 0:
            add_peak(lof_group, row, "#FF66CC", row["amplitude"] / 20)  # pink

    # Adding DBSCAN detected peaks
    dbscan_group = folium.FeatureGroup(name="DBSCAN detection").add_to(peaks_methods)
    for _, row in peak_cr.iterrows():
        if row['y_class_dbscan_only_gaussian_param'] < 0:
            add_peak(dbscan_group, row, "#6A287E", row["amplitude"] / 20)  # purple

    # Adding KMEANS: one toggleable layer per cluster.
    # BUG FIX: the original gave cluster 4 the same brown as cluster 3
    # ("#8A4117") although its comment said "light blue", making the two
    # clusters indistinguishable on the map.
    cluster_colors = {
        0: "#4CBB17",  # green
        1: "#FFA62F",  # yellow
        2: "#2557C7",  # blue
        3: "#8A4117",  # brown
        4: "#ADDFFF",  # light blue
    }
    cluster_groups = {
        k: folium.FeatureGroup(name="K-Means - Cluster %d" % k).add_to(peaks_methods)
        for k in cluster_colors
    }
    for _, row in peak_kmean.iterrows():
        k = row['kmeans_cluster']
        if k in cluster_groups:
            add_peak(cluster_groups[k], row, cluster_colors[k], 1)

    folium.map.LayerControl(collapsed=False).add_to(peaks_methods)

    plugins.Fullscreen(
        position='topright',
        title='Expand me',
        title_cancel='Exit me',
        force_separate_button=True
    ).add_to(peaks_methods)

    peaks_methods.add_child(plugins.MiniMap())

    peaks_methods.save("peaks_methods.html")
    return peaks_methods

peaks_map()
Make this Notebook Trusted to load map: File -> Trust Notebook
{% endraw %}

Map with "Capture Zone"

{% raw %}
def peaks_capture_map(peaks):
    """Map the given peaks together with their polygonal capture zones,
    overlaid with the plants and cities inventory, save the result to
    ``peaks_capture_map.html`` and return it.

    Parameters
    ----------
    peaks : (Geo)DataFrame with 'latitude', 'longitude', 'amplitude',
        'sounding_id' columns and a 'geometry' column holding the capture
        zone polygon of each peak.

    Uses the module-level ``plants`` and ``cities`` dataframes.
    """
    # Initialize Map, centered on North America
    peaks_capture = folium.Map([40, -100], zoom_start=4)
    folium.TileLayer("CartoDB dark_matter", name="Dark mode").add_to(peaks_capture)

    # Adding Power plants (tiny fixed radius so they do not clutter the map;
    # the original computed a generation-based radius but never used it)
    plants_group = folium.FeatureGroup(name="Plants").add_to(peaks_capture)
    for index, row in plants.iterrows():
        color="#999900"
        tooltip =  "["+str(round(row['latitude'],2))+" ; "+str(round(row['longitude'],2))+"]"
        plants_group.add_child(folium.CircleMarker(location=(row["latitude"],
                                      row["longitude"]),
                            radius=0.1,
                            color=color,
                            tooltip=tooltip,
                            fill=True))

    # Adding Cities, sized by population
    cities_group = folium.FeatureGroup(name="Cities").add_to(peaks_capture)
    for index, row in cities.iterrows():
        color="#990099"
        radius = row['Population (CDP)']/2000000
        tooltip =  "["+str(round(row['latitude'],2))+" ; "+str(round(row['longitude'],2))+"]"
        cities_group.add_child(folium.CircleMarker(location=(row["latitude"],
                                      row["longitude"]),
                            radius=radius,
                            color=color,
                            tooltip=tooltip,
                            fill=True))

    # Adding detected peaks (amplitude-sized markers) and their capture zones
    peaks_group = folium.FeatureGroup(name="Peaks").add_to(peaks_capture)
    peaks_group_capture = folium.FeatureGroup(name=" - 50km CirclesCapture Zone").add_to(peaks_capture)
    for index, row in peaks.iterrows():
        radius = row["amplitude"]/20
        color="#FF3333" # red
        sounding = str(row['sounding_id'])

        peaks_group.add_child(folium.CircleMarker(location=(row["latitude"],
                                      row["longitude"]),
                            radius=radius,
                            color=color,
                            tooltip=sounding,
                            fill=True))

        peaks_group_capture.add_child(folium.GeoJson(row['geometry'], name=" - Capture Zone"))

    folium.map.LayerControl(collapsed=False).add_to(peaks_capture)
    peaks_capture.save("peaks_capture_map.html")
    return peaks_capture

peaks_capture_map(peaks_fc_and_sources[0:1])
Make this Notebook Trusted to load map: File -> Trust Notebook
{% endraw %}

Examples, Sanity Checks and Statistics

{% raw %}
# Show the first FC peak's capture zone on its own small map
c = folium.Map(location=[40, 35], zoom_start=6)
zone = folium.GeoJson(peaks_fc_and_sources.iloc[0:1].geometry, name=" - Capture Zone")
zone.add_to(c)
c
Make this Notebook Trusted to load map: File -> Trust Notebook
{% endraw %}

Sanity Check

The figure above shows 4 cities and many plants in the capture zone of the peak; we should retrieve them in the count:

{% raw %}
# Sanity check with the figure above - OK
peaks_fc_and_sources.head(1)
sounding_id latitude longitude orbit slope intercept amplitude sigma delta R windspeed_u windspeed_v surface_pressure_apriori surface_pressure altitude land_water_indicator land_fraction date geometry index_cities number_cities index_plants number_plants
0 2018080111005177 37.444622 30.755703 21715 0.006885 404.427158 64.732216 11.112728 2.32386 0.654418 0.284106 -0.315598 963.941162 964.808594 367.765533 0.0 100.0 2018-08-01 11:00:51.770 POLYGON ((30.82499 37.38318, 30.85930 37.44745, 30.81370 37.77195, 30.47740 37.68998, 30.34141 37.43239, 30.74128 37.36278, 30.82499 37.38318)) [nan] 0 [nan] 0
{% endraw %}

Statistics

We see that, of the 183 peaks analyzed, a significant number of capture zones contain cities, plants, or both.

{% raw %}
# How many capture zones of the FC-method peaks contain known sources
df = peaks_fc_and_sources[['number_cities', 'number_plants']]
col_cities = df.number_cities
col_plants = df.number_plants
cities_hit = col_cities[col_cities > 0].count()
plants_hit = col_plants[col_plants > 0].count()
print(" --- BASE DETECTION WITHOUT FILTER (Benoit) --- ")
print("Total count: ", col_cities.count())
print("Cities mean and std: ", round(col_cities.mean(), 2), " x ", round(col_cities.std(), 2))
print("Cities non-null count: ", cities_hit, " (", round(cities_hit / col_cities.count() * 100, 2), "%)")
print("Plants mean and std: ", round(col_plants.mean(), 2), " x ", round(col_plants.std(), 2))
print("Plants non-null count: ", plants_hit, " (", round(plants_hit / col_plants.count() * 100, 2), "%)")
 --- BASE DETECTION WITHOUT FILTER (Benoit) --- 
Total count:  183
Cities mean and std:  0.27  x  1.19
Cities non-null count:  15  ( 8.2 %)
Plants mean and std:  10.08  x  20.14
Plants non-null count:  110  ( 60.11 %)
{% endraw %} {% raw %}
# Same statistics for the unfiltered CR peaks
df = peaks_cr_and_sources[['number_cities', 'number_plants']]
col_cities = df.number_cities
col_plants = df.number_plants
cities_hit = col_cities[col_cities > 0].count()
plants_hit = col_plants[col_plants > 0].count()
print(" --- BASE DETECTION WITHOUT FILTER (Raphaele & Charlotte) --- ")
print("Total count: ", col_cities.count())
print("Cities mean and std: ", round(col_cities.mean(), 2), " x ", round(col_cities.std(), 2))
print("Cities non-null count: ", cities_hit, " (", round(cities_hit / col_cities.count() * 100, 2), "%)")
print("Plants mean and std: ", round(col_plants.mean(), 2), " x ", round(col_plants.std(), 2))
print("Plants non-null count: ", plants_hit, " (", round(plants_hit / col_plants.count() * 100, 2), "%)")
 --- BASE DETECTION WITHOUT FILTER (Raphaele & Charlotte) --- 
Total count:  2658
Cities mean and std:  0.3  x  1.74
Cities non-null count:  211  ( 7.94 %)
Plants mean and std:  9.52  x  48.23
Plants non-null count:  1047  ( 39.39 %)
{% endraw %} {% raw %}
# Same statistics restricted to the peaks the LOF detector flags as outliers
peaks_cr_lof = peaks_cr_and_sources[peaks_cr_and_sources['y_class_lof_only_gaussian_param'] < 0]
df = peaks_cr_lof[['number_cities', 'number_plants']]
col_cities = df.number_cities
col_plants = df.number_plants
cities_hit = col_cities[col_cities > 0].count()
plants_hit = col_plants[col_plants > 0].count()
print(" --- DETECTION WITH LOF FILTER --- ")
print("Total count: ", col_cities.count())
print("Cities mean and std: ", round(col_cities.mean(), 2), " x ", round(col_cities.std(), 2))
print("Cities non-null count: ", cities_hit, " (", round(cities_hit / col_cities.count() * 100, 2), "%)")
print("Plants mean and std: ", round(col_plants.mean(), 2), " x ", round(col_plants.std(), 2))
print("Plants non-null count: ", plants_hit, " (", round(plants_hit / col_plants.count() * 100, 2), "%)")
 --- DETECTION WITH LOF FILTER --- 
Total count:  51
Cities mean and std:  1.0  x  2.25
Cities non-null count:  11  ( 21.57 %)
Plants mean and std:  21.9  x  65.93
Plants non-null count:  25  ( 49.02 %)
{% endraw %} {% raw %}
# Same statistics restricted to the peaks the DBSCAN detector flags as outliers
peaks_cr_dbscan = peaks_cr_and_sources[peaks_cr_and_sources['y_class_dbscan_only_gaussian_param'] < 0]
df = peaks_cr_dbscan[['number_cities', 'number_plants']]
col_cities = df.number_cities
col_plants = df.number_plants
cities_hit = col_cities[col_cities > 0].count()
plants_hit = col_plants[col_plants > 0].count()
print(" --- DETECTION WITH DBSCAN FILTER --- ")
print("Total count: ", col_cities.count())
print("Cities mean and std: ", round(col_cities.mean(), 2), " x ", round(col_cities.std(), 2))
print("Cities non-null count: ", cities_hit, " (", round(cities_hit / col_cities.count() * 100, 2), "%)")
print("Plants mean and std: ", round(col_plants.mean(), 2), " x ", round(col_plants.std(), 2))
print("Plants non-null count: ", plants_hit, " (", round(plants_hit / col_plants.count() * 100, 2), "%)")
 --- DETECTION WITH DBSCAN FILTER --- 
Total count:  128
Cities mean and std:  0.41  x  1.47
Cities non-null count:  13  ( 10.16 %)
Plants mean and std:  13.43  x  36.28
Plants non-null count:  74  ( 57.81 %)
{% endraw %} {% raw %}
# Same statistics restricted to K-Means cluster 0
peaks_kmean_0 = peaks_kmean_and_sources[peaks_kmean_and_sources['kmeans_cluster'] == 0]
df = peaks_kmean_0[['number_cities', 'number_plants']]
col_cities = df.number_cities
col_plants = df.number_plants
cities_hit = col_cities[col_cities > 0].count()
plants_hit = col_plants[col_plants > 0].count()
print(" --- DETECTION WITH KMEAN FILTER (Cluster 0) --- ")
print("Total count: ", col_cities.count())
print("Cities mean and std: ", round(col_cities.mean(), 2), " x ", round(col_cities.std(), 2))
print("Cities non-null count: ", cities_hit, " (", round(cities_hit / col_cities.count() * 100, 2), "%)")
print("Plants mean and std: ", round(col_plants.mean(), 2), " x ", round(col_plants.std(), 2))
print("Plants non-null count: ", plants_hit, " (", round(plants_hit / col_plants.count() * 100, 2), "%)")
 --- DETECTION WITH KMEAN FILTER (Cluster 0) --- 
Total count:  992
Cities mean and std:  0.33  x  2.3
Cities non-null count:  69  ( 6.96 %)
Plants mean and std:  10.23  x  62.76
Plants non-null count:  370  ( 37.3 %)
{% endraw %} {% raw %}
# Same statistics restricted to K-Means cluster 1
peaks_kmean_1 = peaks_kmean_and_sources[peaks_kmean_and_sources['kmeans_cluster'] == 1]
df = peaks_kmean_1[['number_cities', 'number_plants']]
col_cities = df.number_cities
col_plants = df.number_plants
cities_hit = col_cities[col_cities > 0].count()
plants_hit = col_plants[col_plants > 0].count()
print(" --- DETECTION WITH KMEAN FILTER (Cluster 1) --- ")
print("Total count: ", col_cities.count())
print("Cities mean and std: ", round(col_cities.mean(), 2), " x ", round(col_cities.std(), 2))
print("Cities non-null count: ", cities_hit, " (", round(cities_hit / col_cities.count() * 100, 2), "%)")
print("Plants mean and std: ", round(col_plants.mean(), 2), " x ", round(col_plants.std(), 2))
print("Plants non-null count: ", plants_hit, " (", round(plants_hit / col_plants.count() * 100, 2), "%)")
 --- DETECTION WITH KMEAN FILTER (Cluster 1) --- 
Total count:  108
Cities mean and std:  0.51  x  1.64
Cities non-null count:  14  ( 12.96 %)
Plants mean and std:  16.72  x  45.55
Plants non-null count:  67  ( 62.04 %)
{% endraw %} {% raw %}
# Same statistics restricted to K-Means cluster 2
peaks_kmean_2 = peaks_kmean_and_sources[peaks_kmean_and_sources['kmeans_cluster'] == 2]
df = peaks_kmean_2[['number_cities', 'number_plants']]
col_cities = df.number_cities
col_plants = df.number_plants
cities_hit = col_cities[col_cities > 0].count()
plants_hit = col_plants[col_plants > 0].count()
print(" --- DETECTION WITH KMEAN FILTER (Cluster 2) --- ")
print("Total count: ", col_cities.count())
print("Cities mean and std: ", round(col_cities.mean(), 2), " x ", round(col_cities.std(), 2))
print("Cities non-null count: ", cities_hit, " (", round(cities_hit / col_cities.count() * 100, 2), "%)")
print("Plants mean and std: ", round(col_plants.mean(), 2), " x ", round(col_plants.std(), 2))
print("Plants non-null count: ", plants_hit, " (", round(plants_hit / col_plants.count() * 100, 2), "%)")
 --- DETECTION WITH KMEAN FILTER (Cluster 2) --- 
Total count:  509
Cities mean and std:  0.26  x  1.16
Cities non-null count:  46  ( 9.04 %)
Plants mean and std:  7.72  x  25.02
Plants non-null count:  215  ( 42.24 %)
{% endraw %} {% raw %}
# Same statistics restricted to K-Means cluster 3
peaks_kmean_3 = peaks_kmean_and_sources[peaks_kmean_and_sources['kmeans_cluster'] == 3]
df = peaks_kmean_3[['number_cities', 'number_plants']]
col_cities = df.number_cities
col_plants = df.number_plants
cities_hit = col_cities[col_cities > 0].count()
plants_hit = col_plants[col_plants > 0].count()
print(" --- DETECTION WITH KMEAN FILTER (Cluster 3) --- ")
print("Total count: ", col_cities.count())
print("Cities mean and std: ", round(col_cities.mean(), 2), " x ", round(col_cities.std(), 2))
print("Cities non-null count: ", cities_hit, " (", round(cities_hit / col_cities.count() * 100, 2), "%)")
print("Plants mean and std: ", round(col_plants.mean(), 2), " x ", round(col_plants.std(), 2))
print("Plants non-null count: ", plants_hit, " (", round(plants_hit / col_plants.count() * 100, 2), "%)")
 --- DETECTION WITH KMEAN FILTER (Cluster 3) --- 
Total count:  827
Cities mean and std:  0.26  x  1.38
Cities non-null count:  63  ( 7.62 %)
Plants mean and std:  7.57  x  33.27
Plants non-null count:  281  ( 33.98 %)
{% endraw %} {% raw %}
# Same statistics restricted to K-Means cluster 4
peaks_kmean_4 = peaks_kmean_and_sources[peaks_kmean_and_sources['kmeans_cluster'] == 4]
df = peaks_kmean_4[['number_cities', 'number_plants']]
col_cities = df.number_cities
col_plants = df.number_plants
cities_hit = col_cities[col_cities > 0].count()
plants_hit = col_plants[col_plants > 0].count()
print(" --- DETECTION WITH KMEAN FILTER (Cluster 4) --- ")
print("Total count: ", col_cities.count())
print("Cities mean and std: ", round(col_cities.mean(), 2), " x ", round(col_cities.std(), 2))
print("Cities non-null count: ", cities_hit, " (", round(cities_hit / col_cities.count() * 100, 2), "%)")
print("Plants mean and std: ", round(col_plants.mean(), 2), " x ", round(col_plants.std(), 2))
print("Plants non-null count: ", plants_hit, " (", round(plants_hit / col_plants.count() * 100, 2), "%)")
 --- DETECTION WITH KMEAN FILTER (Cluster 4) --- 
Total count:  19
Cities mean and std:  0.26  x  0.93
Cities non-null count:  2  ( 10.53 %)
Plants mean and std:  22.53  x  27.18
Plants non-null count:  18  ( 94.74 %)
{% endraw %}